library(ggplot2)
library(stringi)
library(gridExtra)
library(kableExtra)
library(psych)
library(limma)
library(tidyverse)
library(CONSTANd)  # install from source: https://github.com/PDiracDelta/CONSTANd/
library(NOMAD)  # devtools::install_github("carlmurie/NOMAD")

This notebook presents isobaric labeling data analysis strategy that includes data-driven normalization.

We will check how varying analysis components [summarization/normalization/differential abundance testing methods] changes end results of a quantitative proteomic study.

source('./other_functions.R')
source('./plotting_functions.R')

# you should either make a symbolic link in this directory
study.design=read.delim('msstatstmt_studydesign.csv')
data.list <- readRDS('input_data.rds')
dat.l <- data.list$dat.l # data in long format
# dat.w <- data.list$dat.w # data in wide format
if ('X' %in% colnames(dat.l)) { dat.l$X <- NULL }

# remove shared peptides
shared.peptides <- dat.l %>% filter(!shared.peptide)

# keep spectra with isolation interference <30 and no missing quantification channels
dat.l <- dat.l %>% filter(isoInterOk & noNAs)

# which proteins were spiked in?
spiked.proteins <- dat.l %>% distinct(Protein) %>% filter(stri_detect(Protein, fixed='ups')) %>% pull %>% as.character

# which peptides were identified in each MS run?
unique.pep=dat.l %>% 
  group_by(Run) %>%
  distinct(Peptide) %>% 
  mutate(val=1)
unique.pep <- xtabs(val~Peptide+Run, data=unique.pep)
tmp <- apply(unique.pep, 1, function(x) all(x==1))
inner.peptides <- rownames(unique.pep)[tmp]
# specify # of varying component variants and their names
n.comp.variants <- 3
variant.names <- c('CONSTANd', 'NOMAD', 'medianSweeping')
scale.vec <- c('log', 'log', 'log')
# pick reference channel and condition for making plots / doing DEA
quanCols <- unique(dat.l$Channel)
referenceChannel <- '127C'
referenceCondition <- '0.5'

1 Unit component

1.1 log2 transformation of reporter ion intensities

dat.unit.l <- dat.l %>% mutate(response=Intensity) %>% select(-Intensity)
# dat.unit.l <- dat.l %>% mutate(response=log2(Intensity)) %>% select(-Intensity)

2 Normalization component

# switch to wide format
dat.unit.w <- pivot_wider(data = dat.unit.l, id_cols=-one_of(c('Condition', 'BioReplicate')), names_from=Channel, values_from=response)
# dat.unit.w2 <- lapply(dat.unit.l, function(x) {
#   pivot_wider(data = x, id_cols=-one_of(c('Condition', 'BioReplicate')), names_from=c('Run','Channel'), values_from=response)
# })
dat.norm.w <- emptyList(variant.names)

2.1 CONSTANd

# dat.unit.l entries are in long format so all have same colnames and no quanCols
x.split <- split(dat.unit.w, dat.unit.w$Run)  # apply CONSTANd to each Run separately
x.split.norm  <- lapply(x.split, function(y) {
  y[,quanCols] <- CONSTANd(y[,quanCols])$normalized_data
  return(y)
})
dat.norm.w$CONSTANd <- bind_rows(x.split.norm)

2.2 NOMAD

We apply NOMAD on the PSM level instead of the peptide level.

# doRobust=F: use means, like CONSTANd; doLog=F: values are already transformed.
dat.nomadnorm <- nomadNormalization(dat.unit.l$response, dat.unit.l %>% rename(iTRAQ=Channel), doRobust = FALSE, doiTRAQCorrection = FALSE, doLog = TRUE)
## Running normalization with  464224  number of data points
## Normalizing for factor:  Peptide 
## Normalizing for factor:  Run 
## Normalizing for factor:  iTRAQ 
## Normalizing for factor:  Run Peptide 
## Normalizing for factor:  Run iTRAQ
dat.nomadnorm$x$response <- dat.nomadnorm$y
dat.norm.w$NOMAD <- pivot_wider(data = dat.nomadnorm$x, id_cols=-one_of(c('Condition', 'BioReplicate')), names_from=iTRAQ, values_from=response)
# get rid of factors
#factornames <- names(dat.norm.w$NOMAD)[sapply(dat.norm.w$NOMAD, is.factor)]
#dat.norm.w$NOMAD <- dat.norm.w$NOMAD %>% mutate(across(factornames, remove_factors))

2.3 medianSweeping

# subtract the spectrum median log2intensity from the observed log2intensities
dat.norm.w$medianSweeping <- dat.unit.w
dat.norm.w$medianSweeping[,quanCols] <- sweep(dat.norm.w$medianSweeping[,quanCols], 1, apply(dat.norm.w$medianSweeping[,quanCols], 1, median) )

3 Summarization component

Summarize quantification values from PSM to peptide (first step) to protein (second step).

3.1 Median summarization (PSM to peptide to protein)

# normalized data
dat.norm.summ.w <- lapply(dat.norm.w, function(x) {
  # group by (run,)protein,peptide then summarize twice (once on each level)
  # add select() statement because summarise_at is going bananas over character columns
  y <- x %>% group_by(Run, Protein, Peptide) %>% select(Run, Protein, Peptide, quanCols) %>% summarise_at(.vars = quanCols, .funs = median) %>% select(Run, Protein, quanCols) %>% summarise_at(.vars = quanCols, .funs = median) %>% ungroup()
  return(y)
})

Notice that the row sums are not equal to Ncols anymore, because the median summarization does not preserve them (but mean summarization does).

Let’s also summarize the non-normalized data for comparison in the next section.

# non-normalized data
# add select() statement because summarise_at is going bananas over character columns
dat.nonnorm.summ.w <- dat.unit.w %>% group_by(Run, Protein, Peptide) %>% select(Run, Protein, Peptide, quanCols) %>% summarise_at(.vars = quanCols, .funs = median) %>% select(Run, Protein, quanCols) %>% summarise_at(.vars = quanCols, .funs = median) %>% ungroup()
# medianSweeping: in each channel, subtract median computed across all proteins within a channel
# do the above separately for each MS run
x.split <- split(dat.norm.summ.w$medianSweeping, dat.norm.summ.w$medianSweeping$Run)  
x.split.norm  <- lapply(x.split, function(y) {
  y[,quanCols] <- sweep(y[,quanCols], 2, apply(y[,quanCols], 2, median) )
  return(y)
})
dat.norm.summ.w$medianSweeping <- bind_rows(x.split.norm)

4 QC plots

# make data completely wide (also across runs)

## non-normalized data
dat.nonnorm.summ.w2 <- dat.nonnorm.summ.w %>% pivot_wider(names_from = Run, values_from = all_of(quanCols))

## normalized data
dat.norm.summ.w2 <- lapply(dat.norm.summ.w, function(x) {
  dat.tmp <- x %>% pivot_wider(names_from = Run, values_from = all_of(quanCols))
  return(dat.tmp)
})

# make vectors with condition labels and color coding corresponding to samples in wide format data
colors.condition <- tribble(
  ~Condition, ~Col,
  "0.125", 'black',
  "0.5", 'blue',
  "0.667", 'green',
  "1", 'red'
)
run_channel_condition <- expand_grid(Channel=quanCols, Run=unique(study.design$Run)) %>% left_join(study.design, by=c('Channel', 'Run')) %>% select(Run, Channel, Condition)
colors.condition.map <- run_channel_condition %>% unite(Channel, Channel:Run) %>% left_join(colors.condition, by='Condition')
ord <- match(colnames(dat.norm.summ.w2[[1]]), colors.condition.map$Channel)
ord <- ord[!is.na(ord)]  # drop first entry which is NA

# important: these two vectors contain colors and condition labels corresponding to data in wide2 format
cols.vec <- colors.condition.map[ord, 'Col']  %>% pull
conditions.vec <- colors.condition.map[ord, 'Condition']  %>% pull

4.1 Boxplots

# use (half-)wide format
par(mfrow=c(2,2))
  boxplot.w(dat.nonnorm.summ.w,study.design, 'Raw')
  
  for (i in 1: n.comp.variants){
    boxplot.w(dat.norm.summ.w[[i]], study.design, paste('Normalized', variant.names[i], sep='_'))
  }

par(mfrow=c(1,1))

4.2 MA plots

MA plots of two single samples taken from condition 1 and condition 0.125, measured in different MS runs (samples Mixture2_1:127C and Mixture1_2:129N, respectively).

# different unit variants require different computation of fold changes and average abuandance: additive or multiplicative scale; see maplot.ils function 
# use wide2 format
p <- vector('list', n.comp.variants+1)
p[[1]] <- maplot.ils(dat.nonnorm.summ.w2, '127C_Mixture2_1', '129N_Mixture1_2', scale.vec[i], paste('Raw', variant.names[i], sep='_'))

for (i in 1: n.comp.variants){
 p[[i+1]]<- maplot.ils(dat.norm.summ.w2[[i]], '127C_Mixture2_1', '129N_Mixture1_2', scale.vec[i], paste('Normalized', variant.names[i], sep='_'))
}
grid.arrange(grobs = p, ncol=2, nrow=2)  

MA plots of all samples from condition 1 and condition 0.125 (quantification values averaged within condition).

# different unit variants require different computation of fold changes and average abuandance: additive or multiplicative scale; see maplot.ils function 
channels.num <- colors.condition.map %>% filter(Condition=='1') %>% select(Channel) %>% pull
channels.denom <- colors.condition.map %>% filter(Condition=='0.125') %>% select(Channel) %>% pull

p <- vector('list', n.comp.variants+1)
p[[1]] <- maplot.ils(dat.nonnorm.summ.w2, channels.num, channels.denom, scale.vec[i], 'Raw')

for (i in 1: n.comp.variants){
 p[[i+1]]<- maplot.ils(dat.norm.summ.w2[[i]], channels.num, channels.denom, scale.vec[i], paste('Normalized', variant.names[i], sep='_'))
}
grid.arrange(grobs = p, ncol=2, nrow=2)  

4.3 CV (coefficient of variation) plots

dat.nonnorm.summ.l <- lapply(list(dat.nonnorm.summ.w), function(x){
  x$Mixture <- unlist(lapply(stri_split(x$Run,fixed='_'), function(y) y[1]))
  x <- to_long_format(x, study.design)
})

dat.norm.summ.l <- lapply(dat.norm.summ.w, function(x){
  x$Mixture <- unlist(lapply(stri_split(x$Run,fixed='_'), function(y) y[1]))
  x <- to_long_format(x, study.design)
})

par(mfrow=c(2, 2))
  cvplot.ils(dat=dat.nonnorm.summ.l[[1]], feature.group='Protein', xaxis.group='Condition',
               title='Raw', abs=F)

for (i in 1: n.comp.variants){
    cvplot.ils(dat=dat.norm.summ.l[[i]], feature.group='Protein', xaxis.group='Condition', 
               title=paste('Normalized', variant.names[i], sep='_'), abs=F)
}

par(mfrow=c(1, 1))  

4.4 PCA plots

4.4.1 Using all proteins

# create a shorter version of run variable to present on legend ([-1] to avoid Protein col)
run.labels <- stri_replace(unlist(lapply(stri_split(colnames(dat.norm.summ.w2[[1]])[-1], fixed='_'), function(x) paste(x[2],x[3],sep = '_'))), fixed='Mixture', 'Mix')
par(mfrow=c(2, 2))
  pcaplot.ils(dat.nonnorm.summ.w2 %>% select(-'Protein'), run.labels, conditions.vec, cols.vec, 'Raw')
  
for (i in seq_along(dat.norm.summ.w2)){
  # select only the spiked.proteins
  pcaplot.ils(dat.norm.summ.w2[[i]] %>% select(-'Protein'), run.labels, conditions.vec, cols.vec, paste('Normalized', variant.names[i], sep='_'))
}

par(mfrow=c(1, 1))  

4.4.2 Using spiked proteins only

# create a shorter version of run variable to present on legend ([-1] to avoid Protein col)
run.labels <- stri_replace(unlist(lapply(stri_split(colnames(dat.norm.summ.w2[[1]])[-1], fixed='_'), function(x) paste(x[2],x[3],sep = '_'))), fixed='Mixture', 'Mix')
par(mfrow=c(2, 2))
  pcaplot.ils(dat.nonnorm.summ.w2 %>% filter(Protein %in% spiked.proteins) %>% select(-'Protein'), run.labels, conditions.vec, cols.vec, 'Raw')
  
for (i in seq_along(dat.norm.summ.w2)){
  # select only the spiked.proteins
  pcaplot.ils(dat.norm.summ.w2[[i]] %>% filter(Protein %in% spiked.proteins) %>% select(-'Protein'), run.labels, conditions.vec, cols.vec, paste('Normalized', variant.names[i], sep='_'))
}

par(mfrow=c(1, 1))  

4.5 HC (hierarchical clustering) plots

Only use spiked proteins

TO DO: - also use short label names like in PCA plot - unify the list of args across pcaplot.ils and dendrogram.ils. Make sure labeling and color picking is done in the same location (either inside or outside the function)

sample.labels <- stri_replace(colnames(dat.nonnorm.summ.w2 %>% select(-Protein)), fixed='Mixture', 'Mix')

par(mfrow=c(2, 2))
  dendrogram.ils(dat.nonnorm.summ.w2 %>% filter(Protein %in% spiked.proteins) %>% select(-Protein), sample.labels, cols.vec, 'Raw')

for (i in seq_along(dat.norm.summ.w2)){
    dendrogram.ils(dat.norm.summ.w2[[i]] %>% filter(Protein %in% spiked.proteins) %>% select(-Protein), sample.labels, cols.vec, paste('Normalized', variant.names[i], sep='_'))
}

par(mfrow=c(1, 1))  

5 DEA component

5.1 Moderated t-test

TODO: - Also try to log-transform the intensity case, to see if there are large differences in the t-test results. - done. remove this code? NOTE: - actually, lmFit (used in moderated_ttest) was built for log2-transformed data. However, supplying untransformed intensities can also work. This just means that the effects in the linear model are also additive on the untransformed scale, whereas for log-transformed data they are multiplicative on the untransformed scale. Also, there may be a bias which occurs from biased estimates of the population means in the t-tests, as mean(X) is not equal to exp(mean(log(X))).

design.matrix <- get_design_matrix(referenceCondition, study.design)
dat.dea <- emptyList(names(dat.norm.summ.w2))
for (i in seq_along(dat.norm.summ.w2)) {
  this_scale <- scale.vec[match(names(dat.dea)[i], variant.names)]
  d <- column_to_rownames(as.data.frame(dat.norm.summ.w2[[i]]), 'Protein')
  dat.dea[[i]] <- moderated_ttest(dat=d, design.matrix, scale=this_scale)
}
# also see what the unnormalized results would look like
n.comp.variants <- n.comp.variants + 1
variant.names <- c(variant.names, 'raw')
scale.vec <- c(scale.vec, 'raw')
dat.dea$raw <- moderated_ttest(dat=column_to_rownames(dat.nonnorm.summ.w2, 'Protein'), design.matrix, scale='raw')

6 Results comparison

6.1 Confusion matrix

cm <- conf.mat(dat.dea, 'q.mod', 0.05, spiked.proteins)
print.conf.mat(cm)
0.667
CONSTANd
NOMAD
medianSweeping
raw
background spiked background spiked background spiked background spiked
not_DE 4063 15 4064 19 4064 18 4064 19
DE 1 4 0 0 0 1 0 0
0.667
CONSTANd NOMAD medianSweeping raw
Accuracy 0.996 0.995 0.996 0.995
Sensitivity 0.211 0.000 0.053 0.000
Specificity 1.000 1.000 1.000 1.000
PPV 0.800 NaN 1.000 NaN
NPV 0.996 0.995 0.996 0.995
0.125
CONSTANd
NOMAD
medianSweeping
raw
background spiked background spiked background spiked background spiked
not_DE 4062 5 4064 15 4062 13 4064 19
DE 2 14 0 4 2 6 0 0
0.125
CONSTANd NOMAD medianSweeping raw
Accuracy 0.998 0.996 0.996 0.995
Sensitivity 0.737 0.211 0.316 0.000
Specificity 1.000 1.000 1.000 1.000
PPV 0.875 1.000 0.750 NaN
NPV 0.999 0.996 0.997 0.995
1
CONSTANd
NOMAD
medianSweeping
raw
background spiked background spiked background spiked background spiked
not_DE 4055 3 4064 17 4063 10 4064 16
DE 9 16 0 2 1 9 0 3
1
CONSTANd NOMAD medianSweeping raw
Accuracy 0.997 0.996 0.997 0.996
Sensitivity 0.842 0.105 0.474 0.158
Specificity 0.998 1.000 1.000 1.000
PPV 0.640 1.000 0.900 1.000
NPV 0.999 0.996 0.998 0.996

6.2 Scatter plots

TO DO: Piotr: constant NOMAD i RAW q-values (approx. 1) generate error in scatterplots

# character vectors containing logFC and p-values columns
dea.cols <- colnames(dat.dea[[1]])
logFC.cols <- dea.cols[stri_detect_fixed(dea.cols, 'logFC')]
q.cols <- dea.cols[stri_detect_fixed(dea.cols, 'q.mod')]
n.contrasts <- length(logFC.cols)

#scatterplot.ils(dat.dea, q.cols, 'p-values') # commented due to error, sd=0 for NOMAD and RAW
scatterplot.ils(dat.dea, logFC.cols, 'log2FC')

6.3 Volcano plots

for (i in 1:n.contrasts){
  volcanoplot.ils(dat.dea, i, spiked.proteins)
}

6.4 Violin plots

Let’s see whether the spiked protein fold changes make sense

# plot theoretical value (horizontal lines) and violin per condition
dat.spiked.logfc <- lapply(dat.dea, function(x) x[spiked.proteins,logFC.cols])
dat.spiked.logfc.l <- lapply(dat.spiked.logfc, function(x) {
  x %>% rename_with(function(y) sapply(y, function(z) strsplit(z, '_')[[1]][2])) %>% pivot_longer(cols = everything(), names_to = 'condition', values_to = 'logFC') %>% add_column(Protein=rep(rownames(x), each=length(colnames(x)))) })
violinplot.ils(lapply(dat.spiked.logfc.l, filter, condition != referenceCondition))

7 Conclusions

8 Session information

sessionInfo()
## R version 4.0.3 (2020-10-10)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 18.04.5 LTS
## 
## Matrix products: default
## BLAS:   /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.7.1
## LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.7.1
## 
## locale:
##  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=de_BE.UTF-8        LC_COLLATE=en_US.UTF-8    
##  [5] LC_MONETARY=de_BE.UTF-8    LC_MESSAGES=en_US.UTF-8   
##  [7] LC_PAPER=de_BE.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=de_BE.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] dendextend_1.14.0 NOMAD_0.99.0      dplR_1.7.1        CONSTANd_0.99.4  
##  [5] forcats_0.5.0     stringr_1.4.0     dplyr_1.0.2       purrr_0.3.4      
##  [9] readr_1.4.0       tidyr_1.1.2       tibble_3.0.4      tidyverse_1.3.0  
## [13] limma_3.45.19     psych_2.0.9       kableExtra_1.3.1  gridExtra_2.3    
## [17] stringi_1.5.3     ggplot2_3.3.2    
## 
## loaded via a namespace (and not attached):
##  [1] nlme_3.1-150         matrixStats_0.57.0   fs_1.5.0            
##  [4] lubridate_1.7.9      webshot_0.5.2        httr_1.4.2          
##  [7] tools_4.0.3          backports_1.1.10     R6_2.4.1            
## [10] rpart_4.1-15         DBI_1.1.0            mgcv_1.8-33         
## [13] colorspace_1.4-1     nnet_7.3-14          withr_2.3.0         
## [16] tidyselect_1.1.0     mnormt_2.0.2         compiler_4.0.3      
## [19] cli_2.1.0            rvest_0.3.6          xml2_1.3.2          
## [22] labeling_0.4.2       scales_1.1.1         digest_0.6.27       
## [25] rmarkdown_2.5        R.utils_2.10.1       pkgconfig_2.0.3     
## [28] htmltools_0.5.0      highr_0.8            dbplyr_1.4.4        
## [31] rlang_0.4.8          readxl_1.3.1         rstudioapi_0.11     
## [34] generics_0.0.2       farver_2.0.3         jsonlite_1.7.1      
## [37] ModelMetrics_1.2.2.2 R.oo_1.24.0          magrittr_1.5        
## [40] Matrix_1.2-18        Rcpp_1.0.5           munsell_0.5.0       
## [43] fansi_0.4.1          viridis_0.5.1        lifecycle_0.2.0     
## [46] R.methodsS3_1.8.1    pROC_1.16.2          yaml_2.2.1          
## [49] MASS_7.3-53          plyr_1.8.6           recipes_0.1.14      
## [52] grid_4.0.3           blob_1.2.1           parallel_4.0.3      
## [55] crayon_1.3.4         lattice_0.20-41      haven_2.3.1         
## [58] splines_4.0.3        hms_0.5.3            tmvnsim_1.0-2       
## [61] knitr_1.30           pillar_1.4.6         stats4_4.0.3        
## [64] reshape2_1.4.4       codetools_0.2-16     reprex_0.3.0        
## [67] XML_3.99-0.5         glue_1.4.2           evaluate_0.14       
## [70] data.table_1.13.2    modelr_0.1.8         png_0.1-7           
## [73] vctrs_0.3.4          foreach_1.5.1        cellranger_1.1.0    
## [76] gtable_0.3.0         assertthat_0.2.1     gower_0.2.2         
## [79] xfun_0.18            prodlim_2019.11.13   broom_0.7.2         
## [82] e1071_1.7-4          survival_3.2-7       class_7.3-17        
## [85] viridisLite_0.3.0    timeDate_3043.102    signal_0.7-6        
## [88] iterators_1.0.13     lava_1.6.8           ellipsis_0.3.1      
## [91] caret_6.0-86         ipred_0.9-9